In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import xgboost as xgb
import operator
%matplotlib inline

In [5]:
SEED = 2017
NFOLDS = 5

Functions for training and making predictions

Function for training an xgb model


In [6]:
def train_xgb(X, y, param):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.005, random_state=SEED)
    
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_val = xgb.DMatrix(X_val, label=y_val)
    
    # Monitor both the training set and the small held-out set during boosting
    watchlist = [(xg_train, 'train'), (xg_val, 'eval')]
    
    return xgb.train(param, xg_train, param['n_rounds'], evals=watchlist)

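The same watchlist also makes early stopping easy to bolt on. A minimal sketch, assuming xgboost's standard early_stopping_rounds / verbose_eval keyword arguments; the 25-round window and the helper name are assumptions, not something used elsewhere in this notebook.

In [ ]:
def train_xgb_early_stop(X, y, param, early_stopping_rounds=25):
    # Same split and watchlist as train_xgb, but stop once validation mlogloss stalls
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.005, random_state=SEED)
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_val = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(xg_train, 'train'), (xg_val, 'eval')]
    return xgb.train(param, xg_train, param['n_rounds'], evals=watchlist,
                     early_stopping_rounds=early_stopping_rounds, verbose_eval=50)
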
Function for making predictions


In [7]:
def predict_xgb(clf, X_test):
    return clf.predict(xgb.DMatrix(X_test))

Parameters for the xgb model


In [8]:
param = {}
param['objective'] = 'multi:softprob'
param['eval_metric'] = 'mlogloss'
param['eta'] = 0.08
param['colsample_bytree'] = 0.8
param['subsample'] = 0.8
param['seed'] = SEED
param['max_depth'] = 6
param['n_rounds'] = 350
param['num_class'] = 3
param['silent'] = 1

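NFOLDS is defined at the top but never used below. As a hedged sketch, the same parameter set could be cross-validated with xgb.cv to sanity-check n_rounds; cv_xgb is a hypothetical helper and the 25-round early-stopping window is an assumption. It is meant to be called once X_train and y_train are built further down.

In [ ]:
def cv_xgb(X, y, param, nfold=NFOLDS):
    # Cross-validated multiclass log loss for the parameter set above
    dtrain = xgb.DMatrix(X, label=y)
    return xgb.cv(param, dtrain, num_boost_round=param['n_rounds'],
                  nfold=nfold, seed=SEED, early_stopping_rounds=25)
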
In [9]:
def create_feature_map(features):
    # Feature map in the format xgboost expects: <index>\t<name>\t<type>,
    # with 'q' marking every feature as quantitative
    with open('xgb.fmap', 'w') as outfile:
        for i, feature in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feature))

Work on the features


In [10]:
train = pd.read_json('train.json')
test = pd.read_json('test.json')
n_train = train.shape[0]

In [11]:
target = {'low':2, 'medium':1, 'high':0}
y_train = train['interest_level'].apply(lambda x : target[x])
X_train = train.drop('interest_level', axis=1)
X_test = test
data = pd.concat([X_train, X_test])

In [12]:
listing_id = test['listing_id'].values

Time feature processing


In [13]:
data['created'] = pd.to_datetime(data['created'])
data['month'] = data['created'].dt.month
data['day'] = data['created'].dt.day
data['week'] = data['created'].dt.week
data['dayofweek'] = data['created'].dt.dayofweek
data['dayofyear'] = data['created'].dt.dayofyear
data['quarter'] = data['created'].dt.quarter
data['hour'] = data['created'].dt.hour
data = data.drop(['created'], axis=1)

Categorical features


In [14]:
cat = ['display_address', 'manager_id', 'building_id', 'street_address']
for col in cat:
    data[col] = LabelEncoder().fit_transform(data[col])

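Fitting the LabelEncoder on the concatenated train and test frames keeps the two label spaces consistent; encoding each split separately could leave test-only categories unmapped. A quick sanity check (a sketch, nothing downstream depends on it):

In [ ]:
for col in cat:
    # Distinct categories seen across train + test combined
    print(col, data[col].nunique())
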
Non-categorical features


In [15]:
data['n_photos'] = data['photos'].apply(len)
data['n_features'] = data['features'].apply(len)
data['n_description'] = data['description'].apply(lambda x: len(x.split(' ')))
data['l_description'] = data['description'].apply(len)

In [16]:
data['featurecopy'] = data['features']
data['featurecopy'] = data['featurecopy'].apply(lambda x: ' '.join(x))
tvectorizer = TfidfVectorizer(stop_words = 'english', max_features = 200, ngram_range=(1,1))
data_sparse = tvectorizer.fit_transform(data['featurecopy'])
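To see which terms the vectorizer actually kept out of the feature lists, the 200-term vocabulary can be inspected directly. A sketch; older scikit-learn versions expose this as get_feature_names(), newer ones rename it to get_feature_names_out().

In [ ]:
vocab = tvectorizer.get_feature_names()
print(len(vocab), vocab[:10])
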

In [17]:
not_features = ['description', 'features', 'listing_id', 'photos', 'featurecopy']
is_feature = [i for i in data.columns if i not in not_features]

In [18]:
data = sparse.hstack([data[is_feature], data_sparse]).tocsr()

In [19]:
X_train = data[:n_train]
X_test = data[n_train:]

In [20]:
#feature_names = is_feature

In [21]:
clf = train_xgb(X_train, np.array(y_train.astype(np.int8)), param)
pred = predict_xgb(clf, X_test)
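
The listing_id values pulled out earlier pair with pred to form a submission. A minimal sketch, assuming the usual high/medium/low probability columns for this competition; the class indices 0/1/2 correspond to high/medium/low under the target mapping above, and the output file name is arbitrary.

In [ ]:
# pred columns follow the class indices: 0 = high, 1 = medium, 2 = low
submission = pd.DataFrame(pred, columns=['high', 'medium', 'low'])
submission['listing_id'] = listing_id
submission.to_csv('submission.csv', index=False)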

In [22]:
feature_names = is_feature + ['sparse_%d' % i for i in range(data_sparse.shape[1])]
create_feature_map(feature_names)

In [27]:
importance = clf.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)

In [28]:
df = pd.DataFrame(importance, columns = ['feature', 'fscore'])

In [39]:
sb.barplot(x='fscore', y='feature', data = df.head(20))


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x3b95ae48>

In [ ]: